[Intro goes here]
import pandas as pd
import MeteoAPI
import importlib as imp
import geopandas as gpd
import pydeck as pdk
import numpy as np
import spectra
import GPy
import itertools as itt
from scipy.stats import norm
/home/aurimas/apps/anaconda3/envs/gp-processes/lib/python3.8/site-packages/geopandas/_compat.py:111: UserWarning: The Shapely GEOS version (3.9.1-CAPI-1.14.2) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow. warnings.warn(
imp.reload(MeteoAPI)
#Setup the API object
meteoApi = MeteoAPI.MeteoAPI(use_cache = True, cache_dir = "meteo-cache")
#Get all available places
places = meteoApi.get_places()
#Get all forecasts and save as a single dataframe
all_forecasts = [meteoApi.get_forecast(p) for p in places]
merged_forecasts = pd.concat(all_forecasts)
#remove columns that are not interesting for us
drop_cols = ['seaLevelPressure', 'relativeHumidity', 'windDirection', 'windGust', 'conditionCode']
forecasts = merged_forecasts.drop(drop_cols, axis=1)
#clip to Lithuania's borders
lithuania = gpd.read_file('lt-shapefile/lt.shp')
lt_forecasts = gpd.clip(forecasts, lithuania).reset_index(drop=True)
lt_forecasts.to_file("lt_forecasts.geojson", driver='GeoJSON', index=False)
lt_forecasts = gpd.read_file("lt_forecasts.geojson")
lt_forecasts.head(n=10)
| forecastTimeUtc | airTemperature | windSpeed | cloudCover | totalPrecipitation | location | geometry | |
|---|---|---|---|---|---|---|---|
| 0 | 2022-02-01T06:00:00 | -2.8 | 4.0 | 18.0 | 0.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 1 | 2022-02-04T21:00:00 | 1.7 | 5.0 | 100.0 | 1.2 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 2 | 2022-02-06T18:00:00 | 2.4 | 7.0 | 97.0 | 3.5 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 3 | 2022-02-06T12:00:00 | 2.1 | 5.0 | 100.0 | 1.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 4 | 2022-02-06T06:00:00 | 0.3 | 7.0 | 100.0 | 0.9 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 5 | 2022-02-06T00:00:00 | 0.3 | 5.0 | 76.0 | 1.2 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 6 | 2022-02-05T18:00:00 | 0.1 | 5.0 | 43.0 | 0.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 7 | 2022-01-31T05:00:00 | -1.3 | 9.0 | 12.0 | 0.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 8 | 2022-01-31T06:00:00 | -1.5 | 9.0 | 9.0 | 0.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
| 9 | 2022-01-31T07:00:00 | -1.7 | 9.0 | 4.0 | 0.0 | Senoji Radiškė | POINT (23.25380 54.29488) |
fc_time = lt_forecasts.loc[0, 'forecastTimeUtc']
morning_forecast = lt_forecasts[lt_forecasts['forecastTimeUtc'] == fc_time].reset_index(drop=True)
print("Selected forecast time: {}".format(fc_time))
Selected forecast time: 2022-02-01T06:00:00
#determine initial map view
bounds = morning_forecast.total_bounds
lt_view = pdk.ViewState(latitude=(bounds[1] + bounds[3]) / 2, longitude=(bounds[0] + bounds[2]) / 2, zoom=6)
colors = ["#d7191c", "#ffffbf", "#1a9641"]
#create helper columns in dataframe for coloring purposes
colorscale = spectra.scale(colors)
temperatures = morning_forecast['airTemperature'].values
scaled_temps = (temperatures - np.min(temperatures)) / (np.max(temperatures) - np.min(temperatures))
scaled_colors = [[c * 255 for c in colorscale(t).clamped_rgb] for t in scaled_temps]
morning_forecast['color'] = scaled_colors
point_layer = pdk.Layer("ScatterplotLayer", morning_forecast,
pickable=True, filled=True, radius_min_pixels=2,
get_position="geometry.coordinates",
get_radius="airTemperature",
get_fill_color='color',
)
tooltip = "{location}\nTemperature: {airTemperature}\n Wind speed: {windSpeed}"
sc_plot = pdk.Deck(layers=[point_layer], tooltip={"text": tooltip}, initial_view_state=lt_view)
sc_plot.to_html()
#organize data into predictors and independent variables
coordinates = np.zeros((len(morning_forecast),2))
coordinates[:,0] = morning_forecast.geometry.x.values
coordinates[:,1] = morning_forecast.geometry.y.values
actual_temperatures = temperatures.reshape(-1, 1)
#fit the simplest GP model
interpolation_model = GPy.models.GPRegression(X=coordinates, Y=actual_temperatures)
interpolation_model.optimize_restarts(num_restarts = 1)
Optimization restart 1/1, f = -137.91647725935127
[<paramz.optimization.optimization.opt_lbfgsb at 0x7fb9db2c0400>]
x_space = np.linspace(coordinates[:,0].min(), coordinates[:,0].max(), 100)
y_space = np.linspace(coordinates[:,1].min(), coordinates[:,1].max(), 100)
xx, yy = np.meshgrid(x_space, y_space)
grid = np.concatenate((xx.reshape(-1,1), yy.reshape(-1, 1)), axis=1)
p_means, p_vars = interpolation_model.predict(grid)
points = gpd.points_from_xy(grid[:,0], grid[:,1], crs='EPSG:4326')
interpolated = gpd.GeoDataFrame(p_means, columns=['temp'], geometry=points)
interpolated['scaled_temp'] = (p_means - p_means.min()) / (p_means.max() - p_means.min())
lt_interpolated = gpd.clip(interpolated, lithuania).reset_index(drop=True)
lt_interpolated.head()
| temp | geometry | scaled_temp | |
|---|---|---|---|
| 0 | -1.733825 | POINT (22.05330 55.05224) | 0.656697 |
| 1 | -1.632037 | POINT (22.05330 55.07663) | 0.665867 |
| 2 | -2.008786 | POINT (23.62212 54.00375) | 0.631923 |
| 3 | -1.706434 | POINT (23.50591 54.00375) | 0.659164 |
| 4 | -2.110134 | POINT (23.68022 54.00375) | 0.622792 |
heatLayer = pdk.Layer(
'HeatmapLayer',
lt_interpolated,
opacity=0.5,
get_position="geometry.coordinates",
aggregation=pdk.types.String("MEAN"),
color_range=[[c*255 for c in colorscale(i).clamped_rgb] for i in [0,0.5,1]],
threshold=1,
get_weight="scaled_temp",
pickable=True,
)
r = pdk.Deck(layers=[heatLayer,point_layer], initial_view_state=lt_view)
r.to_html()
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import mean_absolute_error
train_coords, test_coords, train_temps, test_temps = tts(coordinates, actual_temperatures, random_state=42)
#small helper function to get training and test set errors
def get_errors(models, X_train, X_test, Y_train, Y_test):
results = []
for label, model in models.items():
training_error = mean_absolute_error(Y_train, model.predict(X_train)[0])
test_error = mean_absolute_error(Y_test, model.predict(X_test)[0])
results.append((label, training_error, test_error))
return pd.DataFrame(results, columns=['model', 'Training MAE', 'Test MAE'])
models = {
'basic-GP': GPy.models.GPRegression(X=train_coords, Y=train_temps)
}
models['basic-GP'].optimize_restarts(num_restarts=1, verbose=False)
[<paramz.optimization.optimization.opt_lbfgsb at 0x7fb9d1309940>]
get_errors(models, train_coords, test_coords, train_temps, test_temps)
| model | Training MAE | Test MAE | |
|---|---|---|---|
| 0 | basic-GP | 0.095522 | 0.122836 |
conf_ints = pd.DataFrame([0.5, 0.7, 0.9, 0.95, 0.99], columns=['confidence level'])
conf_ints['total observations'] = len(p_means)
def count_in_range(cint):
factor = -norm.ppf((1 - cint) / 2)
side_range = np.sqrt(p_vars) * factor
return np.sum((test_temps <= (p_means + side_range)) & (test_temps >= (p_means - side_range)))
conf_ints['observations in interval'] = conf_ints['confidence level'].apply(count_in_range)
conf_ints['% observations in interval'] = conf_ints['observations in interval'] / conf_ints['total observations']
conf_ints
| confidence level | total observations | observations in interval | % observations in interval | |
|---|---|---|---|---|
| 0 | 0.50 | 513 | 290 | 0.565302 |
| 1 | 0.70 | 513 | 393 | 0.766082 |
| 2 | 0.90 | 513 | 479 | 0.933723 |
| 3 | 0.95 | 513 | 495 | 0.964912 |
| 4 | 0.99 | 513 | 509 | 0.992203 |
matern_kernel = GPy.kern.Matern52(input_dim=2)
linear_kernel = GPy.kern.Linear(input_dim=2)
models['Matern52 kernel'] = GPy.models.GPRegression(X=train_coords, Y=train_temps, kernel = matern_kernel)
models['Matern52 kernel'].optimize_restarts(num_restarts=1, verbose=False)
models['Matern52 + linear kernel'] = GPy.models.GPRegression(X=train_coords, Y=train_temps, kernel = matern_kernel + linear_kernel)
models['Matern52 + linear kernel'].optimize_restarts(num_restarts=1, verbose=False)
[<paramz.optimization.optimization.opt_lbfgsb at 0x7fb9cde2a130>]
get_errors(models, train_coords, test_coords, train_temps, test_temps)
| model | Training MAE | Test MAE | |
|---|---|---|---|
| 0 | basic-GP | 0.095522 | 0.122836 |
| 1 | Matern52 kernel | 0.055417 | 0.099285 |
| 2 | Matern52 + linear kernel | 0.044996 | 0.097688 |
pd.options.mode.chained_assignment = None # default='warn'
point_forecast = lt_forecasts[lt_forecasts['location'] == "Kaunas"]
point_forecast['forecastTimeUtc'] = pd.to_datetime(point_forecast['forecastTimeUtc'])
point_forecast['hours_offset'] = (point_forecast['forecastTimeUtc'] - point_forecast['forecastTimeUtc'].min()).astype("timedelta64[h]")
first_two_days = point_forecast.sort_values(['forecastTimeUtc']).head(48)
import altair as alt
alt.Chart(first_two_days).mark_line().encode(
x = alt.X("yearmonthdatehours(forecastTimeUtc):O", title="Time"),
y = "airTemperature").properties(
title="Temperature in Kaunas",
width=600)
hours_offset = first_two_days['hours_offset'].values.reshape(-1, 1)
temperature = first_two_days['airTemperature'].values.reshape(-1, 1)
forecast_kernel = (GPy.kern.PeriodicExponential(input_dim=1) + GPy.kern.Linear(input_dim=1))
forecast_model = GPy.models.GPRegression(X=hours_offset, Y=temperature, kernel = forecast_kernel, normalizer=True)
forecast_model.optimize_restarts(num_restarts=1, verbose=False)
[<paramz.optimization.optimization.opt_lbfgsb at 0x7fb9cc827160>]
def plot_temp_forecast(model):
hours = np.fromiter(range(0,80), float).reshape(-1, 1)
predicted_mean, predicted_var = model.predict(hours)
temp_forecasts = pd.DataFrame({
'hours': hours.reshape(-1),
'temperature' : predicted_mean.reshape(-1),
'variance': predicted_var.reshape(-1)
})
temp_forecasts['lower'] = temp_forecasts['temperature'] - np.sqrt(temp_forecasts['variance']) * 1.96
temp_forecasts['upper'] = temp_forecasts['temperature'] + np.sqrt(temp_forecasts['variance']) * 1.96
fact_chart = alt.Chart(first_two_days).mark_line().encode(
x=alt.X('hours_offset', title="hours"),
y=alt.Y('airTemperature', title="temperature")
)
forecast_chart = alt.Chart(temp_forecasts).mark_line(color='red').encode(
x=alt.X('hours', title="hours"),
y=alt.Y('temperature', title="temperature")
)
forecast_confidence = alt.Chart(temp_forecasts).mark_area(color='red', opacity=0.2).encode(
x=alt.X('hours', title="hours"),
y=alt.Y('lower', title="temperature"),
y2=alt.Y2('upper', title="temperature")
)
return (fact_chart + forecast_chart + forecast_confidence).properties(title="Temperature forecast")
plot_temp_forecast(forecast_model)
forecast_model.optimize_restarts(num_restarts=10, verbose=False)
plot_temp_forecast(forecast_model)
/home/aurimas/apps/anaconda3/envs/gp-processes/lib/python3.8/site-packages/GPy/kern/src/periodic.py:86: RuntimeWarning:overflow encountered in true_divide /home/aurimas/apps/anaconda3/envs/gp-processes/lib/python3.8/site-packages/GPy/kern/src/periodic.py:40: RuntimeWarning:invalid value encountered in multiply /home/aurimas/apps/anaconda3/envs/gp-processes/lib/python3.8/site-packages/GPy/kern/src/periodic.py:40: RuntimeWarning:invalid value encountered in cos
morning_forecast.corr()
| airTemperature | windSpeed | cloudCover | totalPrecipitation | |
|---|---|---|---|---|
| airTemperature | 1.000000 | 0.168348 | 0.385487 | -0.054599 |
| windSpeed | 0.168348 | 1.000000 | -0.135505 | -0.012198 |
| cloudCover | 0.385487 | -0.135505 | 1.000000 | 0.175226 |
| totalPrecipitation | -0.054599 | -0.012198 | 0.175226 | 1.000000 |
#organize data into predictors and independent variables
y_variables = ['windSpeed', 'airTemperature']
no_y_variables = len(y_variables)
actual_obs = morning_forecast[y_variables].apply(lambda x: (x - x.min())/(x.max() - x.min()), axis=0).values
train_coords, test_coords, train_obs, test_obs = tts(coordinates, actual_obs, random_state=72, train_size=1500)
tr_coords = [train_coords[:1000, :], train_coords[500:, :]]
tr_obs = [train_obs[:1000, 0, None], train_obs[500:, 1, None]]
result_bag = []
for i, name in enumerate(y_variables):
train_x = tr_coords[i]
train_y = tr_obs[i]
test_x = train_coords
test_y = train_obs[:, i, None]
singleOutput_model = GPy.models.GPRegression(X=train_x, Y=train_y)
singleOutput_model.optimize_restarts(num_restarts=1, verbose=False)
preds = singleOutput_model.predict(test_x)
test_error = mean_absolute_error(test_y, preds[0])
result_bag.append((name, 'single-output', test_error))
pd.DataFrame(result_bag, columns=['Variable', 'Model type', 'Test MAE'])
| Variable | Model type | Test MAE | |
|---|---|---|---|
| 0 | windSpeed | single-output | 0.051657 |
| 1 | airTemperature | single-output | 0.015650 |
multiOutput_model = GPy.models.GPCoregionalizedRegression(tr_coords, tr_obs)
multiOutput_model.optimize_restarts(num_restarts=5, verbose=False)
[<paramz.optimization.optimization.opt_lbfgsb at 0x7f5f47a61b20>, <paramz.optimization.optimization.opt_lbfgsb at 0x7f5f41a0d160>, <paramz.optimization.optimization.opt_lbfgsb at 0x7f5f433b60d0>, <paramz.optimization.optimization.opt_lbfgsb at 0x7f5f3effc070>, <paramz.optimization.optimization.opt_lbfgsb at 0x7f5f9a8b7d90>]
inds = np.ones((train_coords.shape[0], 1))
predSpace = np.vstack([np.hstack([train_coords, inds * i]) for i in range(no_y_variables)])
noise_dict = {'output_index': predSpace[:, 2].astype(int)}
obs_preds = multiOutput_model.predict(predSpace, Y_metadata=noise_dict)
result_bag = []
for i, name in enumerate(y_variables):
test_y = train_obs[:, i, None]
pred_y = obs_preds[0][predSpace[:, 2] == i]
test_error = mean_absolute_error(test_y, pred_y)
result_bag.append((name, 'multi-output', test_error))
pd.DataFrame(result_bag, columns=['Variable', 'Model type', 'Test MAE'])
| Variable | Model type | Test MAE | |
|---|---|---|---|
| 0 | windSpeed | multi-output | 0.053533 |
| 1 | airTemperature | multi-output | 0.014418 |
rng = np.random.default_rng(seed=42)
vp = morning_forecast['windSpeed'].values ** 0.16
t = morning_forecast['airTemperature'].values
morning_forecast['windChill'] = 13.12 + 0.6215 * t - 11.37 * vp + 0.3965 * t * vp - rng.normal(size=len(morning_forecast))
morning_forecast['norm_windChill'] = (morning_forecast['windChill'] - morning_forecast['windChill'].min()) / (morning_forecast['windChill'].max() - morning_forecast['windChill'].min())
coldest_place = morning_forecast[morning_forecast['windChill'] == morning_forecast['windChill'].min()].loc[:, ['location', 'geometry', "windChill"]]
coldest_place
| location | geometry | windChill | |
|---|---|---|---|
| 1769 | Kirmėliai | POINT (24.69512 55.57395) | -11.586682 |
chill_layer = pdk.Layer(
'HeatmapLayer',
morning_forecast,
opacity=0.5,
get_position="geometry.coordinates",
aggregation=pdk.types.String("MEAN"),
color_range=[[c*255 for c in colorscale(i).clamped_rgb] for i in [0,0.5,1]],
threshold=1,
get_weight="norm_windChill",
pickable=False,
)
top_chill_layer = pdk.Layer("ScatterplotLayer", coldest_place,
pickable=True, filled=True, radius_min_pixels=3,
get_position="geometry.coordinates",
get_radius="10",
get_fill_color="[230,85,13]",
)
r = pdk.Deck(layers=[chill_layer, top_chill_layer], initial_view_state=lt_view, tooltip ={ "html": "{location}: {windChill}"})
r.to_html()
from shapely.geometry import Point
from shapely.ops import nearest_points
all_points = morning_forecast.geometry.unary_union
def get_chill_factor(lat, lon):
point = Point(lat, lon)
nearest = nearest_points(point, all_points)[1]
nearest_chill_factor = morning_forecast[morning_forecast.geometry == nearest]['windChill']
return -nearest_chill_factor.values[0]
get_chill_factor(24,55)
5.262881288455483
from bayes_opt import BayesianOptimization
param_bounds = {
'lat': (bounds[0], bounds[2]),
'lon': (bounds[1], bounds[3])
}
optimizer = BayesianOptimization(
f=get_chill_factor,
pbounds=param_bounds,
random_state=42,
verbose=False
)
optimizer.maximize(
init_points=4,
n_iter=15,
acq = 'ei',
xi=0
)
explorations = gpd.GeoDataFrame([{'target': d['target'],
'lat': d['params']['lat'],
'lon': d['params']['lon']} for d in optimizer.res])
#type conversion lat/lon to geometry column
points = [Point(lon, lat) for i,lat,lon in explorations[['lon', 'lat']].itertuples()]
explorations = explorations.assign(**{'geometry': gpd.GeoSeries(points).set_crs('EPSG:4326')})
explorations = explorations.drop(['lat', 'lon'], axis = 1)
point_arr = [[p.x, p.y] for p in points[4:]]
path = pd.DataFrame([{"path": point_arr}])
chill_layer = pdk.Layer(
'HeatmapLayer',
morning_forecast,
opacity=0.5,
get_position="geometry.coordinates",
aggregation=pdk.types.String("MEAN"),
color_range=[[c*255 for c in colorscale(i).clamped_rgb] for i in [0,0.5,1]],
threshold=1,
get_weight="norm_windChill",
pickable=True,
)
exp_layer = pdk.Layer("ScatterplotLayer", explorations,
pickable=True, filled=True, radius_min_pixels=3,
get_position="geometry.coordinates",
get_radius="10",
get_fill_color="[255,255,255]"
)
best_found = pdk.Layer("ScatterplotLayer", explorations[explorations['target'] == explorations['target'].max()],
pickable=True, filled=True, radius_min_pixels=3,
get_position="geometry.coordinates",
get_radius="10",
get_fill_color="[44,162,95]"
)
path_layer = pdk.Layer(
type="PathLayer",
data=path,
pickable=True,
get_color="[255,255,255]",
width_scale=20,
width_min_pixels=2,
get_path="path",
get_width=5,
)
top_chill_layer = pdk.Layer("ScatterplotLayer", coldest_place,
pickable=True, filled=True, radius_min_pixels=3,
get_position="geometry.coordinates",
get_radius="10",
get_fill_color="[230,85,13]"
)
r = pdk.Deck(layers=[chill_layer, exp_layer, top_chill_layer, path_layer, best_found], initial_view_state=lt_view, tooltip ={ "html": "{location}: {windChill}"})
r.to_html()
print("Lowest chill factor found: {:.5f}".format(-explorations['target'].max()))
Lowest chill factor found: -11.58668